Example #1
    def insert_many(self, table, input_data_list, ret_info=None, cols=None):
        "Insert many records into the table given by table_name."
        self.grab_session()

        # Resolve the table instance
        if isinstance(table, str):
            inputs = dict.fromkeys(self.get_column_names(table))
            table = self.tables[table]
        else:
            inputs = dict.fromkeys(self.get_column_names(table.__tablename__))

        # Set the default return info
        if ret_info is None:
            ret_info = inspect(table).primary_key[0].name

        # Prepare and insert the data
        entry_list = []
        for input_data in input_data_list:
            if cols:
                input_dict = zip(cols, input_data)
            else:
                input_dict = input_data
            inputs.update(input_dict)
            entry_list.append(table(**inputs))
            inputs = inputs.fromkeys(inputs)  # Clear the values of the dict.
        self.session.add_all(entry_list)
        self.commit("Excepted while trying to insert:\n%s,\ninto %s" %
                    (input_data_list, table.__tablename__))
        return self.get_values(entry_list, ret_info)
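A detail worth noting in the loop above: `dict.fromkeys` is a classmethod, so `inputs.fromkeys(inputs)` simply rebuilds the dict from its own keys with every value reset to None, which is what lets `inputs` be reused for the next record. A minimal standalone sketch of that reset idiom (the column names are illustrative, not taken from the actual schema):

inputs = dict.fromkeys(["pmid", "pmcid", "content"])   # {'pmid': None, 'pmcid': None, 'content': None}
inputs.update(pmid="12345", content=b"...")
inputs = inputs.fromkeys(inputs)   # same keys, all values back to None
assert all(v is None for v in inputs.values())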
Example #2
def get_text_content_summary_string(q, db, num_ids=None):
    """Create a table with some summary data for a query."""
    N_tot = q.count()
    if num_ids is not None:
        logger.info("Found %d text content entires out of %d ids." %
                    (N_tot, num_ids))
    if N_tot > 0:
        log_n = floor(log10(N_tot))
    else:
        log_n = 1
    cols = list(formats.values()) + ['tot']
    col_fmt = ' %%%ds' % max(4, log_n)
    cols_strs = [col_fmt % fmt for fmt in cols]
    ret_str = 'Summary Statistics:\n' + ' ' * 10 + ''.join(cols_strs) + '\n'
    col_counts = dict.fromkeys(formats.values())
    col_counts['tot'] = []
    for texttype in texttypes.values():
        line = '%8s: ' % texttype
        counts = []
        for text_format in cols[:-1]:
            if col_counts[text_format] is None:
                col_counts[text_format] = []
            c = q.filter(db.TextContent.text_type == texttype,
                         db.TextContent.format == text_format).count()
            line += col_fmt % c
            counts.append(c)
            col_counts[text_format].append(c)
        line += col_fmt % sum(counts)
        ret_str += line + '\n'
        col_counts['tot'].append(sum(counts))
    ret_str += '%8s: ' % 'total' + ''.join(
        [col_fmt % sum(col_counts[col]) for col in cols])
    return ret_str
Example #3
 def insert(self, tbl_name, ret_info='id', **input_dict):
     "Insert a an entry into specified table, and return id."
     self.grab_session()
     inputs = dict.fromkeys(self.get_columns(tbl_name))
     inputs.update(input_dict)
     new_entry = self.tables[tbl_name](**inputs)
     self.session.add(new_entry)
     self.commit("Excepted while trying to insert %s into %s" %
                 (inputs, tbl_name))
     return self.get_values([new_entry], ret_info)[0]
Example #4
 def test_wordlist(self):
     # Assert lazy loading Wordlist.
     v = en.wordlist.STOPWORDS
     self.assertTrue("the" in v)
     # Assert Wordlist to dict.
     v = dict.fromkeys(en.wordlist.STOPWORDS, True)
     self.assertTrue("the" in v)
     # Assert new Wordlist by adding other Wordlists.
     v = en.wordlist.STOPWORDS + en.wordlist.ACADEMIC
     self.assertTrue("the" in v)
     self.assertTrue("dr." in v)
     print("pattern.en.wordlist.Wordlist")
Example #5
    def insert(self, table, ret_info=None, **input_dict):
        "Insert a an entry into specified table, and return id."
        self.grab_session()
        # Resolve the table instance
        if isinstance(table, str):
            inputs = dict.fromkeys(self.get_column_names(table))
            table = self.tables[table]
        else:
            inputs = dict.fromkeys(self.get_column_names(table.__tablename__))

        # Get the default return info
        if ret_info is None:
            ret_info = inspect(table).primary_key[0].name

        # Do the insert
        inputs.update(input_dict)
        new_entry = table(**inputs)
        self.session.add(new_entry)
        self.commit("Excepted while trying to insert %s into %s" %
                    (inputs, table.__tablename__))
        return self.get_values([new_entry], ret_info)[0]
Example #6
File: test_en.py Project: clips/pattern
 def test_wordlist(self):
     # Assert lazy loading Wordlist.
     v = en.wordlist.STOPWORDS
     self.assertTrue("the" in v)
     # Assert Wordlist to dict.
     v = dict.fromkeys(en.wordlist.STOPWORDS, True)
     self.assertTrue("the" in v)
     # Assert new Wordlist by adding other Wordlists.
     v = en.wordlist.STOPWORDS + en.wordlist.ACADEMIC
     self.assertTrue("the" in v)
     self.assertTrue("dr." in v)
     print("pattern.en.wordlist.Wordlist")
Example #7
def test_multiple_text_ref_pmc_oa():
    "Test whether a duplicate text ref in pmc oa is handled correctly."
    db = get_db()
    pmc = PmcOA(ftp_url=TEST_FTP, local=True)
    inp = dict.fromkeys(pmc.tr_cols)
    inp.update(pmcid='PMC5579538', doi='10.1021/acsomega.7b00205')
    pmc.upload_batch(db, [inp], [])
    num_refs = len(db.select_all('text_ref'))
    pmc.upload_batch(db, [inp], [])
    assert len(db.select_all('text_ref')) == num_refs,\
        "Duplicate refs allowed to be submitted.."
    return
Example #8
 def insert_many(self, tbl_name, input_dict_list, ret_info='id'):
     "Insert many records into the table given by table_name."
     self.grab_session()
     inputs = dict.fromkeys(self.get_columns(tbl_name))
     entry_list = []
     for input_dict in input_dict_list:
         inputs.update(input_dict)
         entry_list.append(self.tables[tbl_name](**inputs))
         inputs = inputs.fromkeys(inputs)  # Clear the values of the dict.
     self.session.add_all(entry_list)
     self.commit("Excepted while trying to insert:\n%s,\ninto %s" %
                 (input_dict_list, tbl_name))
     return self.get_values(entry_list, ret_info)
Example #9
def find_cols(find, label, header, default_all=False):
    """
    Identify the column numbers for a given list of columns.

    @param find: a tuple of the column labels to look for
    @param label: a label for the search, used in error message
    @param header: list of column headers
    @param default_all: if result should default to all columns
        (defaults to False)
    @return dict
    """
    # set up columns to keep
    cols = dict()
    if find is None:
        if default_all:
            cols = dict.fromkeys(header)
    else:
        # check all find columns in header
        if any(f not in header for f in find):
            raise MyError("All '%s'-columns must be in header" % label)
        cols = dict.fromkeys(find)
    for c in cols:
        cols[c] = header.index(c)
    return cols
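For context, a quick illustration of how `find_cols` maps labels to column indices (the header and labels below are hypothetical, made up for the example):

header = ["id", "name", "date", "value"]
find_cols(("name", "value"), "keep", header)        # -> {'name': 1, 'value': 3}
find_cols(None, "keep", header, default_all=True)   # -> {'id': 0, 'name': 1, 'date': 2, 'value': 3}
find_cols(None, "keep", header)                     # -> {}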
Example #10
    def updatePC(self):
        self.pcComboBox.clear()

        Salle = self.salleComboBox.currentText()
        listePC = []
        listePC.append("...")

        if self.chemin.strip() != "":

            with open(self.chemin, newline='') as csvfile:
                spamreader = csv.reader(csvfile, delimiter=';', quotechar='|')

                next(spamreader, None)

                for row in spamreader:
                    infoPC = row[0]
                    if infoPC.split("-")[1] == Salle:
                        listePC.append(infoPC)

            listePC = list(dict.fromkeys(listePC))

            self.pcComboBox.addItems(listePC)
Example #11
    def statConnexionFilePushButtonPressed(self):
        #Clear the items in the combobox
        self.salleComboBox.clear()

        #Get the file path using the QFileDialog widget
        cheminTemp = QFileDialog.getOpenFileName(self, 'Choisir College', '',
                                                 '*.csv')
        self.chemin = cheminTemp[0]
        self.statConnexionFileLineEdit.setText(self.chemin)

        listeSalle = []
        numSalle = []
        numSalle.append("...")
        if self.chemin.strip() != "":

            #Read the CSV file
            with open(self.chemin, newline='') as csvfile:
                spamreader = csv.reader(csvfile, delimiter=';', quotechar='|')

                next(spamreader, None)

                #Fill the PC combobox
                for row in spamreader:
                    infoPC = row[0]
                    listeSalle.append(infoPC.split("-"))

                #Fill the room (salle) combobox
                for row in listeSalle:
                    numeroSalle = row[1]
                    numSalle.append(numeroSalle)

            #Remove duplicates from the room list
            numSalle = list(dict.fromkeys(numSalle))

            #Add the room list to the combobox
            self.salleComboBox.addItems(numSalle)
Example #12
    def FindWordFrequency(self, DataSourceText):
        WordFrequencyRow = []
        result = self.text_preprocessing(DataSourceText)
        frequency_list, frequency = self.GenerateFrequencyList(result)

        total_count = 0
        # stop_words = set(stopwords.words('english'))

        for words in frequency_list:
            total_count += frequency[words]

        for words in frequency_list:
            syns = wn.synsets(words)

            synonyms = []
            antonyms = []
            i = 0

            if len(syns) != 0:
                for syn in syns:
                    if len(syn.lemmas()) != 0:
                        for l in syn.lemmas():
                            for w1 in frequency_list:
                                if w1 == l.name():
                                    synonyms.append(l.name())
                                    if (len(l.antonyms()) != 0):
                                        if l.antonyms():
                                            for w2 in frequency_list:
                                                if w2 == l.antonyms()[0].name(
                                                ):
                                                    antonyms.append(
                                                        l.antonyms()[0].name())
                                                    break

            if len(synonyms) == 0:
                synonyms.append('-')
            else:
                synonyms = list(dict.fromkeys(synonyms))

            if len(antonyms) == 0:
                antonyms.append('-')
            else:
                antonyms = list(dict.fromkeys(antonyms))

            weighted_percentage = round((frequency[words] / total_count) * 100,
                                        2)

            if len(syns) != 0:
                WordFrequencyRow.append([
                    words,
                    len(words), frequency[words], weighted_percentage,
                    syns[0].definition(), ', '.join(set(synonyms)),
                    ', '.join(set(antonyms))
                ])
            else:
                WordFrequencyRow.append([
                    words,
                    len(words), frequency[words], weighted_percentage, '-',
                    ', '.join(set(synonyms)), ', '.join(set(antonyms))
                ])

        return WordFrequencyRow
Example #13
    "ugly": "ugli",
    "early": "earli",
    "only": "onli",
    "singly": "singl"
}

# Words that are never stemmed:
uninflected = dict.fromkeys(
    [
        "sky",
        "news",
        "howe",
        "inning",
        "outing",
        "canning",
        "proceed",
        "exceed",
        "succeed",
        "atlas",
        "cosmos",
        "bias",
        "andes"  # not plural forms
    ],
    True)

#--- STEMMER ---------------------------------------------------------------------------------------


def case_sensitive(stem, word):
    """ Applies the letter case of the word to the stem:
        Ponies => Poni
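The `dict.fromkeys([...], True)` construction above is just a constant-time lookup set, presumably so the stemmer can test `word in uninflected` cheaply. The same idiom in isolation (demo values only, not the module's word list):

uninflected_demo = dict.fromkeys(["sky", "news", "bias"], True)
assert "news" in uninflected_demo        # O(1) membership test
assert "ponies" not in uninflected_demo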
Example #14
def test_id_handling_pmc_oa():
    "Test every conceivable combination pmid/pmcid presence."
    db = get_db()
    pmc = PmcOA(ftp_url=TEST_FTP, local=True)

    # Initialize with all possible states we could have gotten from medline.
    pm_inp_tpl_list = capitalize_list_of_tpls(
        [('caseA%d' % i, 'PMCcaseA%d' % i)
         for i in range(2)] + [('caseB%d' % i, None)
                               for i in range(2)] + [(None, 'PMCcaseC%d' % i)
                                                     for i in range(2)] +
        [('caseMisMatchA',
          'PMCcaseMisMatchB'), ('caseMisMatchB', 'PMCcaseMisiMatchB'),
         ('caseMultiMatch', 'PMCcaseMultiMatch'), ('28884161', None),
         ('26977217', 'PMC4771487')])
    db.insert_many('text_ref',
                   [dict(zip(('pmid', 'pmcid'), d)) for d in pm_inp_tpl_list])

    # Prepare the 'batch' to be submitted for pmc oa, and try it.
    oa_inp_tpl_list = capitalize_list_of_tpls(
        [('case%s0' % l, 'PMCcase%s0' % l) for l in ['A', 'B', 'C']] +
        [(None, 'PMCcase%s1' % l) for l in ['A', 'B', 'C']] + [
            (None, 'PMC5579538'),  # lookup pmid in db
            (None, 'PMC4238023'),  # lookup no pmid in db
            ('26977217', 'PMC5142709'),  # conflicting pmcid
            ('caseMisMatchB', 'PMCcaseMisMatchA'),  # multiple matches
            ('caseMultiMatch', 'PMCnotmatching'),
            ('notmatching', 'PMCcaseMultiMatch'),
        ])
    tr_inp = []
    for pmid, pmcid in oa_inp_tpl_list:
        inp_dict = dict.fromkeys(pmc.tr_cols)
        inp_dict.update(pmcid=pmcid, pmid=pmid)
        tr_inp.append(inp_dict)
    tc_inp = [{
        'pmcid': pmcid,
        'text_type': 'txt',
        'content': b'content'
    } for _, pmcid in oa_inp_tpl_list]
    pmc.review_fname = 'test_review_%s.txt' % pmc.my_source
    pmc.upload_batch(db, tr_inp, tc_inp)

    # Check the text refs.
    expected_pairs = capitalize_list_of_tpls([
        ('caseA0', 'PMCcaseA0'),
        ('caseA1', 'PMCcaseA1'),
        ('caseB0', 'PMCcaseB0'),
        ('caseB1', None),  # in practice this should be resolved with id_lookup
        ('caseC0', 'PMCcaseC0'),
        (None, 'PMCcaseC1'),
        ('28884161', 'PMC5579538'),
        ('26977217', 'PMC4771487'),
        (None, 'PMCcaseB1'),
        ('25409783', 'PMC4238023'),
        ('caseMisMatchA', 'PMCcaseMisMatchB'),
        ('caseMisMatchB', 'PMCcaseMisiMatchB'),
        ('caseMultiMatch', 'PMCcaseMultiMatch'),
    ])
    actual_pairs = [(tr.pmid, tr.pmcid) for tr in db.select_all('text_ref')]
    assert_contents_equal(actual_pairs, expected_pairs,
                          'DB text refs incorrect.')

    with open(pmc.review_fname, 'r') as f:
        found_conflict_msg = False
        for line in f.read().splitlines():
            if all([
                    word in line for word in
                ['PMC4771487', 'PMC5142709', 'conflicting pmcid']
            ]):
                found_conflict_msg = True
                break
        assert found_conflict_msg

    # Check the text content
    assert len(db.select_all('text_content')) == 8, 'Too much DB text content.'
    remove(pmc.review_fname)
    return
Example #15
def build_set(n, parent_dir):
    """Create the nastiest set of content we're willing/able to handle.

    We create a small local representation of the entirety of the NLM
    repositories we use, including all the nasty corner cases we can manage.
    This allows for rapid development and testing.

    Parameters
    ----------
    n : int
        The number of instances (distinct articles) of each test case to be
        included. Examples are chosen as randomly as possible. Multiple samples
        generally increase the reliability of the test.
    parent_dir : str
        The head of the tree that stands in place of the url to the nih ftp
        directory.
    """

    # Create the necessary directories.
    def get_path(sub_path):
        return os.path.join(parent_dir, sub_path)

    if os.path.exists(parent_dir):
        shutil.rmtree(parent_dir)
    os.makedirs(parent_dir)
    os.makedirs(get_path('pub/pmc'))
    os.makedirs(get_path('pubmed/baseline'))
    os.makedirs(get_path('pub/pmc/manuscript'))

    # Get the pmid data from medline (med_pmid_list)
    print("Getting medline lists...")
    med_pmid_list = []
    med = Pubmed()
    for i in range(1, 7):
        buf = BytesIO()
        med.ftp.ret_file("MuId-PmId-%d.zip" % i, buf)
        zf = zipfile.ZipFile(buf)
        with zf.open(zf.namelist()[0]) as id_f:
            id_str = id_f.read().decode('utf8')
        med_pmid_list += [l.split('\t')[1] for l in id_str.splitlines()]

    statementful_pmids = [
        '20949557', '23898069', '19801969', '21042724', '14675752', '25897078',
        '25486481', '12890751', '11251186', '20622853', '25616414', '21878640',
        '23295773', '19747910', '25778309', '25939761', '11871856', '16580132',
        '24730770', '23921085', '22018470', '19405127', '21464949', '18321309',
        '7907095', '12048232', '23751074', '18711136', '13679391', '22193543',
        '26645886', '27086966', '14570914', '20538416', '9417079', '23200589',
        '15146469', '18084123', '19265534', '19449221', '27381626', '14976202',
        '22445724', '20040392', '26039245', '17881156', '15902258', '1745350',
        '18276758', '22764095', '20652941', '25834816', '23068100', '16407218',
        '18830263', '24265318', '19752028', '8589722', '22671588', '14745431',
        '25042645', '19403642', '14707024', '23536437', '21167476', '22801439',
        '25726184', '19723643', '17409824', '28679432', '26908611', '20164468',
        '15189946', '12086229', '21900397', '12324477', '15545228', '23376846',
        '21719749', '20608972', '23583295', '23236067', '9705962', '20068183',
        '19437340', '14534726', '25731731', '15337767', '28067895', '25092803',
        '19261749', '22272295', '27121230', '23302038', '17410335', '17399955',
        '16254247', '21685363', '26598524', '25645929', '1386335', '20606534',
        '22492281', '22158902', '22022427', '24775712', '21298412', '24753544',
        '12553064', '19681600', '17912454', '17597401', '20672986', '21362231',
        '17999917', '21470928', '27334922', '16159962', '21079653', '15125833',
        '27617579', '19048115', '18687691', '27797218', '26413934', '16684954',
        '20501406', '27515963', '22784503', '25941399', '12473120', '17891137',
        '16733295', '23826126', '21427728', '8900182', '26234677', '24648515',
        '25786138', '12958678', '16998791', '19061835', '11283269', '18258923',
        '11839584', '20132317', '19158374', '23245941', '23352210', '15465819',
        '15386433', '22575647', '15966238', '23633483', '25131797', '17102080',
        '19956840', '18506362', '17961162', '1607067', '24770328', '19825990',
        '22365656', '19720761', '24435975', '26882953', '17292826', '25119113',
        '26044620', '20717925', '15316008', '16619041', '19893488', '26999786',
        '26103054', '17331464', '20022966', '24189165', '19059939', '25474223',
        '20507346', '20976540', '2810532', '15685397', '27562587', '18538673',
        '15712349', '15448517', '27467210', '7584044', '21330319', '18381962',
        '24789704', '19058873', '10523313'
    ]

    elsevier_pmids = [
        "140233", "126700", "138421", "131864", "122916", "127363", "130834",
        "135691", "147139", "142190", "124378", "132969", "127549", "131583",
        "148910", "140686", "126304", "124909", "145863", "127687", "143909",
        "134286", "144524", "145955", "125088", "122895", "144611", "152202",
        "140767", "139895", "152644", "140057", "149561", "143963", "136992",
        "137557", "144535", "148891", "145321", "133684", "126386", "148890",
        "124210", "131711", "124967", "138753", "132192", "142510", "130244",
        "123485", "126883", "151536", "126948", "137419", "141952", "130051",
        "122816", "150450", "133686", "126866", "138748", "149542", "144038",
        "145957", "136213", "148513", "141931", "140056", "139935", "123177",
        "124593", "141942", "133729", "124598", "124252", "126303", "152671",
        "141908", "124625", "152721", "150335", "133685", "150977", "124154",
        "140713", "146095", "123742", "140478", "143938", "140806", "124600",
        "123729", "127548", "145041", "139938", "143289", "131554", "125206",
        "142661", "122933"
    ]

    # Get the data from pmc oa (pmc_dicts)
    print("Getting pmc oa lists....")
    pmc = PmcOA()
    pmc_dicts = pmc.ftp.get_csv_as_dict('oa_file_list.csv', header=0)

    # Get the data for the manuscripts (man_dicts)
    print("Getting manuscript lists...")
    man = Manuscripts()
    man_dicts = man.ftp.get_csv_as_dict('filelist.csv', header=0)

    # Get pmid, pmcid, mid tuples for the examples that we will use.
    print("Generating example sets...")
    examples = []
    for case in [(1, 0, 0), (1, 1, 0), (0, 1, 0), (1, 1, 1), (1, 0, 1)]:
        for _ in range(n):
            example = _get_example(case, med_pmid_list, pmc_dicts, man_dicts)
            examples.append(example)

    # Add a few pmids that probably include some statements.
    for pmid in random.sample(statementful_pmids, n):
        examples.append((pmid, '', ''))

    # Add a few pmids that link to elsevier content
    for pmid in random.sample(elsevier_pmids, n):
        examples.append((pmid, '', ''))

    # Add a special article to check article info.
    double_doi_info = med.get_article_info('baseline/pubmed18n0343.xml.gz')
    pmids_w_double_doi = [
        k for k, v in double_doi_info.items()
        if v['doi'] is not None and len(v['doi']) > 100
    ]
    assert len(pmids_w_double_doi), "No double dois found."
    examples.append((
        random.choice(pmids_w_double_doi),
        '',
        '',
    ))

    # Create the test medline file.
    print("Creating medline test file...")
    pmid_list = [pmid for pmid, _, _ in examples if pmid != '']
    tree = None
    for pmid in pmid_list:
        params = {'db': 'pubmed', 'retmode': 'xml', 'id': pmid}
        if tree is None:
            tree = pub.send_request(pub.pubmed_fetch, params)
        else:
            child = pub.send_request(pub.pubmed_fetch, params).getchildren()[0]
            tree.append(child)
    if tree is not None:
        f_bts = b''
        f_bts += b"<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
        f_bts += ET.tostring(tree)
        f_path = get_path('pubmed/baseline/pubmed18nTEST.xml.gz')
        with open(f_path, 'wb') as gzf:
            gzf.write(gzip.compress(f_bts))

    # Create the test pmc oa article directory.
    print("Getting pmc oa xmls...")
    art_dirname = get_path('pub/pmc/articles.TEST.xml')
    if os.path.exists(art_dirname):
        shutil.rmtree(art_dirname)
    os.mkdir(art_dirname)
    pmcid_list = [pmcid for _, pmcid, _ in examples if pmcid != '']
    ex_pmc_dicts = [d for d in pmc_dicts if d['Accession ID'] in pmcid_list]
    for d in ex_pmc_dicts:
        fname = pmc.ftp.download_file(d['File'])
        with tarfile.open(fname, 'r:gz') as tar:
            mems = tar.getmembers()
            mem = [mem for mem in mems if mem.name.endswith('.nxml')][0]
            f_str = tar.extractfile(mem).read()
        fname = d['Accession ID'] + '.nxml'
        re_ret = re.findall('<journal-title>(.*?)</journal-title>',
                            f_str.decode('utf8'))
        if len(re_ret):
            sub_dir = os.path.join(
                art_dirname, re_ret[0].replace(' ', '_').replace('&', ''))
        else:
            sub_dir = os.path.join(art_dirname, 'unknown')
        if not os.path.exists(sub_dir):
            os.mkdir(sub_dir)
        path = os.path.join(sub_dir, fname)
        with open(path, 'wb') as f:
            f.write(f_str)
    with tarfile.open(art_dirname + '.tar.gz', 'w:gz') as tar:
        for dirname in os.listdir(art_dirname):
            tar.add(os.path.join(art_dirname, dirname), arcname=dirname)
    shutil.rmtree(art_dirname)

    # Create deleted pmids file (just make an empty file, for now).
    # TODO: Add test case to touch this.
    with open(get_path('pubmed/deleted.pmids.gz'), 'wb') as gzf:
        gzf.write(gzip.compress(b''))

    # Create the test manuscripts file.
    print('Adding manuscript directories...')
    dirfmt = get_path('pub/pmc/manuscript/%s')
    dirnames = [dirfmt % ('PMC00%dXXXXXX.xml' % i) for i in range(2, 6)]
    for dirname in dirnames:
        if os.path.exists(dirname):
            shutil.rmtree(dirname)
        os.mkdir(dirname)
    ex_man_dicts = [d for d in man_dicts if d['PMCID'] in pmcid_list]
    for d in ex_man_dicts:
        d['Tarfile'] = man.get_tarname_from_filename(d['File'])
    tar_members = dict.fromkeys(set([d['Tarfile'] for d in ex_man_dicts]))
    for tarname in tar_members.keys():
        if not os.path.exists(tarname):
            print("\tDownloading %s..." % tarname)
            man.ftp.download_file(tarname)
    for d in ex_man_dicts:
        tarname = d['Tarfile']
        parent_dir = os.path.join(dirfmt % tarname.replace('.tar.gz', ''),
                                  os.path.dirname(d['File']))
        test_fname = os.path.join(dirfmt % tarname.replace('.tar.gz', ''),
                                  d['File'])
        with tarfile.open(d['Tarfile'], 'r:gz') as tar:
            print('\tExtracting %s from %s...' % (d['File'], d['Tarfile']))
            tar.extract(d['File'])
        if not os.path.exists(parent_dir):
            os.makedirs(parent_dir)
        os.rename(d['File'], test_fname)
    for dirname in dirnames:
        with tarfile.open(dirname + '.tar.gz', 'w:gz') as tar:
            for sub_dirname in os.listdir(dirname):
                tar.add(os.path.join(dirname, sub_dirname),
                        arcname=sub_dirname)
        shutil.rmtree(dirname)

    return examples
Example #16
def d(*args):
    return dict.fromkeys(args, True)
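This tiny helper just turns its arguments into a set-like lookup dict; a usage sketch (the words are illustrative, not taken from the module):

MODALS = d("can", "could", "may", "might")
assert MODALS == {"can": True, "could": True, "may": True, "might": True}
assert "can" in MODALS and "must" not in MODALS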
Example #17
File: stemmer.py Project: clips/pattern
    "innings": "inning",
    "outings": "outing",
    "cannings": "canning",
    "idly": "idl",
    "gently": "gentl",
    "ugly": "ugli",
    "early": "earli",
    "only": "onli",
    "singly": "singl"
}

# Words that are never stemmed:
uninflected = dict.fromkeys([
    "sky",
    "news",
    "howe",
    "inning", "outing", "canning",
    "proceed", "exceed", "succeed",
    "atlas", "cosmos", "bias", "andes" # not plural forms
], True)

#--- STEMMER ---------------------------------------------------------------------------------------


def case_sensitive(stem, word):
    """ Applies the letter case of the word to the stem:
        Ponies => Poni
    """
    ch = []
    for i in range(len(stem)):
        if word[i] == word[i].upper():
            ch.append(stem[i].upper())
Example #18
 def remove_duplicates(values: List[str]) -> List[str]:
     return list(dict.fromkeys(values))
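This is the standard order-preserving deduplication idiom: `dict.fromkeys` keeps the first occurrence of each value in its original position (guaranteed on Python 3.7+, where dicts preserve insertion order), whereas a plain `set()` gives no ordering guarantee. A quick check:

values = ["b", "a", "b", "c", "a"]
assert list(dict.fromkeys(values)) == ["b", "a", "c"]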
Example #19
File: search.py Project: clips/pattern
        try:
            return [w.synonyms[0] for w in self.wordnet.synsets(word, pos[:2])[0].hypernyms()]
        except:
            pass

#from en import wordnet
#taxonomy.classifiers.append(WordNetClassifier(wordnet))
#print(taxonomy.parents("ponder", pos="VB"))
#print(taxonomy.children("computer"))

#### PATTERN #######################################################################################

#--- PATTERN CONSTRAINT ----------------------------------------------------------------------------

# Allowed chunk, role and part-of-speech tags (Penn Treebank II):
CHUNKS = dict.fromkeys(["NP", "PP", "VP", "ADVP", "ADJP", "SBAR", "PRT", "INTJ"], True)
ROLES = dict.fromkeys(["SBJ", "OBJ", "PRD", "TMP", "CLR", "LOC", "DIR", "EXT", "PRP"], True)
TAGS = dict.fromkeys(["CC", "CD", "CJ", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "JJ*",
                        "LS", "MD", "NN", "NNS", "NNP", "NNP*", "NNPS", "NN*", "NO", "PDT", "PR",
                        "PRP", "PRP$", "PR*", "PRP*", "PT", "RB", "RBR", "RBS", "RB*", "RP",
                        "SYM", "TO", "UH", "VB", "VBZ", "VBP", "VBD", "VBN", "VBG", "VB*",
                        "WDT", "WP*", "WRB", "X", ".", ",", ":", "(", ")"], True)

ALPHA = re.compile("[a-zA-Z]")
has_alpha = lambda string: ALPHA.match(string) is not None


class Constraint(object):

    def __init__(self, words=[], tags=[], chunks=[], roles=[], taxa=[], optional=False, multiple=False, first=False, taxonomy=TAXONOMY, exclude=None, custom=None):
        """ A range of words, tags and taxonomy terms that matches certain words in a sentence.        
Example #20
from builtins import dict

N1 = list(input())
ss = input()
len_N2 = len(set(ss.replace('*', '')))
N2 = list(ss)
dic1 = dict.fromkeys(list(set(N1 + N2)), 0)
dic2 = dict.fromkeys(list(set(N1 + N2)), 0)

for i, j in zip(N1, N2):
    dic1[i] += 1
    dic2[j] += 1
star_diff = 0
if '*' in N2:
    star_diff = dic2['*']
val = 0
if sorted(dic1.keys()) == sorted(dic2.keys()) or abs(len_N2-len(set(N1))) >= star_diff:
    for i, j in zip(sorted(dic1.items()), sorted(dic2.items())):
        if i[1] < j[1]:
            val += j[1] - i[1]
    if star_diff >= val:
        print('A')
    else:
        print('N')
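Here `dict.fromkeys(list(set(N1 + N2)), 0)` pre-seeds both counters with every character from either input set to zero, so the two dicts always share the same key set before tallying. A minimal sketch of that seeding step (the input strings are made up):

N1, N2 = list("abca"), list("ab*a")
counts = dict.fromkeys(set(N1 + N2), 0)   # e.g. {'a': 0, 'b': 0, 'c': 0, '*': 0}
for ch in N1:
    counts[ch] += 1
assert counts["a"] == 2 and counts["*"] == 0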
Example #21
File: modality.py Project: clips/pattern
def d(*args):
    return dict.fromkeys(args, True)
Example #22
if __name__ == '__main__':
    parser = ArgumentParser(
        description='Assemble many pickle files into one.'
        )
    parser.add_argument(
        '-r', '--readers',
        dest='readers',
        nargs='+',
        help='Choose which reader(s) to use.'
        )
    parser.add_argument(
        dest='file_list',
        nargs='+',
        help='A list of file paths.'
        )
    args = parser.parse_args()

    all_stmts = dict.fromkeys(args.readers)
    for k in all_stmts.keys():
        all_stmts[k] = {}

    for file in args.file_list:
        with open(file, 'rb') as f:
            stmts = pickle.load(f)
        for reader in args.readers:
            all_stmts[reader].update(stmts[reader])

    with open('reading_stmts.pkl', 'wb') as f:
        pickle.dump(all_stmts, f)
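The two-step initialization here (`dict.fromkeys(args.readers)` followed by assigning a fresh `{}` to each key) sidesteps a classic pitfall: passing a mutable default such as `{}` directly to `fromkeys` would make every key share one and the same dict object. A small illustration (the reader names are placeholders):

shared = dict.fromkeys(["reader_a", "reader_b"], {})
shared["reader_a"]["x"] = 1
assert shared["reader_b"] == {"x": 1}      # both keys point at the same dict

separate = {k: {} for k in ["reader_a", "reader_b"]}
separate["reader_a"]["x"] = 1
assert separate["reader_b"] == {}          # independent dicts per key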
Example #23
    def _report_timing(self, timing_info):
        # Pivot the timing info.
        idx_patt = re.compile(r'%s_(\d+)_(\d+)' % self.basename)
        job_segs = NestedDict()
        plot_set = set()
        for stage, stage_d in timing_info.items():
            # e.g. reading, statement production...
            for metric, metric_d in stage_d.items():
                # e.g. start, end, ...
                for job_name, t in metric_d.items():
                    # e.g. job_basename_startIx_endIx
                    job_segs[job_name][stage][metric] = t
                    m = idx_patt.match(job_name)
                    if m is None:
                        logger.error("Unexpectedly formatted name: %s." %
                                     job_name)
                        continue
                    key = tuple([int(n) for n in m.groups()] + [job_name])
                    plot_set.add(key)
        plot_list = list(plot_set)
        plot_list.sort()

        # Use this for getting the minimum and maximum.
        all_times = [
            dt for job in job_segs.values() for stage in job.values()
            for metric, dt in stage.items() if metric != 'duration'
        ]
        all_start = min(all_times)
        all_end = max(all_times)

        def get_time_tuple(stage_data):
            start_seconds = (stage_data['start'] - all_start).total_seconds()
            return start_seconds, stage_data['duration'].total_seconds()

        # Make the broken barh plots.
        w = 6.5
        h = 9
        fig = plt.figure(figsize=(w, h))
        gs = plt.GridSpec(2, 1, height_ratios=[10, 1])
        ax0 = plt.subplot(gs[0])
        ytick_pairs = []
        stages = ['reading', 'statement production', 'stats']
        t = arange((all_end - all_start).total_seconds())
        counts = dict.fromkeys(['jobs'] + stages)
        for k in counts.keys():
            counts[k] = array([0 for _ in t])
        for i, job_tpl in enumerate(plot_list):
            s_ix, e_ix, job_name = job_tpl
            job_d = job_segs[job_name]
            xs = [get_time_tuple(job_d[stg]) for stg in stages]
            ys = (s_ix, (e_ix - s_ix) * 0.9)
            ytick_pairs.append(((s_ix + e_ix) / 2, '%s_%s' % (s_ix, e_ix)))
            logger.debug("Making plot for: %s" % str((job_name, xs, ys)))
            ax0.broken_barh(xs, ys, facecolors=('red', 'green', 'blue'))

            for n, stg in enumerate(stages):
                cs = counts[stg]
                start = xs[n][0]
                dur = xs[n][1]
                cs[(t > start) & (t < (start + dur))] += 1
            cs = counts['jobs']
            cs[(t > xs[0][0]) & (t < (xs[-1][0] + xs[-1][1]))] += 1

        # Format the plot
        ax0.tick_params(top='off',
                        left='off',
                        right='off',
                        bottom='off',
                        labelleft='on',
                        labelbottom='off')
        for spine in ax0.spines.values():
            spine.set_visible(False)
        total_time = (all_end - all_start).total_seconds()
        ax0.set_xlim(0, total_time)
        ax0.set_ylabel(self.basename + '_ ...')
        print(ytick_pairs)
        yticks, ylabels = zip(*ytick_pairs)
        print(yticks)
        if not self.ids_per_job:
            print([yticks[i + 1] - yticks[i] for i in range(len(yticks) - 1)])
            # Infer if we don't have it.
            spacing = median(
                [yticks[i + 1] - yticks[i] for i in range(len(yticks) - 1)])
            spacing = max(1, spacing)
        else:
            spacing = self.ids_per_job
        print(spacing)
        print(yticks[0], yticks[-1])
        ytick_range = list(arange(yticks[0], yticks[-1] + spacing, spacing))
        ylabel_filled = []
        for ytick in ytick_range:
            if ytick in yticks:
                ylabel_filled.append(ylabels[yticks.index(ytick)])
            else:
                ylabel_filled.append('FAILED')
        ax0.set_ylim(0, max(ytick_range) + spacing)
        ax0.set_yticks(ytick_range)
        ax0.set_yticklabels(ylabel_filled)

        # Plot the lower axis.
        legend_list = []
        color_map = {
            'jobs': 'k',
            'reading': 'r',
            'statement production': 'g',
            'stats': 'b'
        }
        ax1 = plt.subplot(gs[1], sharex=ax0)
        for k, cs in counts.items():
            legend_list.append(k)
            ax1.plot(t, cs, color=color_map[k])
        for lbl, spine in ax1.spines.items():
            spine.set_visible(False)
        max_n = max(counts['jobs'])
        ax1.set_ylim(0, max_n + 1)
        ax1.set_xlim(0, total_time)
        yticks = list(range(0, max_n - max_n // 5, max(1, max_n // 5)))
        ax1.set_yticks(yticks + [max_n])
        ax1.set_yticklabels([str(n) for n in yticks] + ['max=%d' % max_n])
        ax1.set_ylabel('N_jobs')
        ax1.set_xlabel('Time since beginning [seconds]')

        # Make the figure borders more sensible.
        fig.tight_layout()
        img_path = 'time_figure.png'
        fig.savefig(img_path)
        self.reporter.add_image(img_path, width=w, height=h, section='Plots')
        return
Example #24
if includes:
    aux += includes

strIncs = [str(i) for i in aux]

listToExclude = []

if (USE_EXCLUDE_FOLDERS):
    
    allIncFoldes = []
    includes = [str(i) for i in includes]

    for filename in Path('.').rglob('*.h'):
        allIncFoldes.append(str(filename.parent))

    allIncFoldes = list(dict.fromkeys(allIncFoldes))
    allIncFoldes.sort()

    filtedist = []
    parent = ""
    for p in allIncFoldes:
        if parent == "":
            parent = p
            filtedist.append(p)
        elif(p.startswith(parent+"/")):
            None
        else:
            parent = p
            filtedist.append(p)

    allIncFoldes = filtedist
Example #25
from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str
import pickle
from argparse import ArgumentParser

if __name__ == '__main__':
    parser = ArgumentParser(description='Assemble many pickle files into one.')
    parser.add_argument('-r',
                        '--readers',
                        dest='readers',
                        nargs='+',
                        help='Choose which reader(s) to use.')
    parser.add_argument(dest='file_list',
                        nargs='+',
                        help='A list of file paths.')
    args = parser.parse_args()

    all_stmts = dict.fromkeys(args.readers)
    for k in all_stmts.keys():
        all_stmts[k] = {}

    for file in args.file_list:
        with open(file, 'rb') as f:
            stmts = pickle.load(f)
        for reader in args.readers:
            all_stmts[reader].update(stmts[reader])

    with open('reading_stmts.pkl', 'wb') as f:
        pickle.dump(all_stmts, f)
Example #26
            ]
        except:
            pass


#from en import wordnet
#taxonomy.classifiers.append(WordNetClassifier(wordnet))
#print(taxonomy.parents("ponder", pos="VB"))
#print(taxonomy.children("computer"))

#### PATTERN #######################################################################################

#--- PATTERN CONSTRAINT ----------------------------------------------------------------------------

# Allowed chunk, role and part-of-speech tags (Penn Treebank II):
CHUNKS = dict.fromkeys(
    ["NP", "PP", "VP", "ADVP", "ADJP", "SBAR", "PRT", "INTJ"], True)
ROLES = dict.fromkeys(
    ["SBJ", "OBJ", "PRD", "TMP", "CLR", "LOC", "DIR", "EXT", "PRP"], True)
TAGS = dict.fromkeys([
    "CC", "CD", "CJ", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "JJ*", "LS",
    "MD", "NN", "NNS", "NNP", "NNP*", "NNPS", "NN*", "NO", "PDT", "PR", "PRP",
    "PRP$", "PR*", "PRP*", "PT", "RB", "RBR", "RBS", "RB*", "RP", "SYM", "TO",
    "UH", "VB", "VBZ", "VBP", "VBD", "VBN", "VBG", "VB*", "WDT", "WP*", "WRB",
    "X", ".", ",", ":", "(", ")"
], True)

ALPHA = re.compile("[a-zA-Z]")
has_alpha = lambda string: ALPHA.match(string) is not None


class Constraint(object):
Example #27
    def validate(self):
        Salle = self.salleComboBox.currentText()
        PC = self.pcComboBox.currentText()
        dateMinimum = self.dateDebutLineEdit.text()
        dateMaximum = self.dateFinLineEdit.text()
        self.ui.resultatTextEdit.clear()

        if dateMinimum < dateMaximum:

            with open(self.chemin, newline='') as csvfile:
                spamreader = csv.reader(csvfile, delimiter=';', quotechar='|')

                next(spamreader, None)
                connexion = 0
                connexionTotale = 0
                connexionTotaleSalle = 0
                PCCollege = []
                self.rowListe = []
                self.rowListe.append("<!doctype html>")
                self.rowListe.append("<html>")
                self.rowListe.append("<head>")
                self.rowListe.append('<meta charset="utf-8">')
                self.rowListe.append(
                    '<link rel="stylesheet" href="https://cdn.datatables.net/1.10.20/css/jquery.dataTables.min.css">'
                )
                self.rowListe.append(
                    '<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.4.1/jquery.min.js"></script>'
                )
                self.rowListe.append(
                    '<script src="https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js"></script>'
                )
                self.rowListe.append("</head>")
                self.rowListe.append("<body>")
                self.rowListe.append(
                    '<table id="resultats"><thead><tr><th>Machine Id</th><th>Etat</th><th>Utilisateur</th><th>Date</th><th>Heure</th></tr></thead><tbody>'
                )

                for row in spamreader:
                    date = row[3]
                    pc = row[0]

                    #PC in the room
                    if dateMinimum <= date and dateMaximum >= date and pc == PC and row[
                            1] == "1":
                        connexion = connexion + 1

                    #Room
                    if dateMinimum <= date and dateMaximum >= date and row[
                            1] == "1" and (pc.split("-")[1] == Salle
                                           or Salle == "..."):
                        connexionTotaleSalle = connexionTotaleSalle + 1
                        nombrePC = self.pcComboBox.count()

                    #Whole school (collège)
                    if dateMinimum <= date and dateMaximum >= date and row[
                            1] == "1":
                        connexionTotale = connexionTotale + 1
                        PCCollege.append(row[0])

                    if dateMinimum <= date and dateMaximum >= date and pc == PC:
                        self.rowListe.append("<tr><td>" + row[0] + "</td>" +
                                             "<td>" + row[1] + "</td>" +
                                             "<td>" + row[2] + "</td>" +
                                             "<td>" + row[3] + "</td>" +
                                             "<td>" + row[4] + "</td>" +
                                             "</tr>")

                    if dateMinimum <= date and dateMaximum >= date and pc.split(
                            "-")[1] == Salle and PC == "...":
                        self.rowListe.append("<tr><td>" + row[0] + "</td>" +
                                             "<td>" + row[1] + "</td>" +
                                             "<td>" + row[2] + "</td>" +
                                             "<td>" + row[3] + "</td>" +
                                             "<td>" + row[4] + "</td>" +
                                             "</tr>")

                    if dateMinimum <= date and dateMaximum >= date and Salle == "...":
                        self.rowListe.append("<tr><td>" + row[0] + "</td>" +
                                             "<td>" + row[1] + "</td>" +
                                             "<td>" + row[2] + "</td>" +
                                             "<td>" + row[3] + "</td>" +
                                             "<td>" + row[4] + "</td>" +
                                             "</tr>")

                PCCollege = list(dict.fromkeys(PCCollege))
                nombrePCUtilisie = len(PCCollege)

                self.rowListe.append("</tbody></table>")
                self.rowListe.append("<script>")
                self.rowListe.append("$(document).ready( function () {")
                self.rowListe.append("$('#resultats')")
                self.rowListe.append(".addClass( 'nowrap' )")
                self.rowListe.append(".dataTable( {")
                self.rowListe.append("responsive: true")
                self.rowListe.append("} );")
                self.rowListe.append("} );")
                self.rowListe.append("</script>")
                self.rowListe.append("</html>")

            self.bilan(connexion, dateMinimum, dateMaximum, PC, Salle,
                       connexionTotale, connexionTotaleSalle, nombrePC,
                       nombrePCUtilisie)